In [1]:
# Necessary imports 
import os
import time
from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.features.features import Features
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary
from nbminer.encoders.ast_graph.ast_graph import *

In [2]:
# Gather every .ipynb from each student's subdirectory under ../testbed/Final
people = os.listdir('../testbed/Final')
notebooks = []
for person in people:
    person = os.path.join('../testbed/Final', person)
    if os.path.isdir(person):
        direc = os.listdir(person)
        notebooks.extend([os.path.join(person, filename) for filename in direc if filename.endswith('.ipynb')])
notebook_objs = [NotebookMiner(file) for file in notebooks]

In [3]:
# Loading in the homework corpus (the Final corpus was loaded in the previous cell)
notebooks = [os.path.join('../hw_corpus', fname) for fname in os.listdir('../hw_corpus')]
hw_notebook_objs = [NotebookMiner(file) for file in notebooks]

In [4]:
# Group the homework notebooks by author (the filename prefix before the first underscore)
person_to_notebooks = {}
for nb in hw_notebook_objs:
    person = nb.filename.split('/')[2].split('_')[0]
    if person not in person_to_notebooks:
        person_to_notebooks[person] = []
    person_to_notebooks[person].append(nb)

In [5]:
print(len(person_to_notebooks))


56

In [6]:
print(len(os.listdir('../testbed/Final')))


176

There are 176 students in the Final directory but only 56 in the homework corpus. The hw_corpus was actually built from 60 repos, so 4 of them apparently contain no notebooks. Representative example: cyriaquebrousse.
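
As a quick sanity check on these numbers, the two author sets can be compared directly. This is a minimal sketch and assumes the subdirectory names in ../testbed/Final use the same author identifiers as the homework filename prefixes collected above:

In [ ]:
final_students = {p for p in os.listdir('../testbed/Final')
                  if os.path.isdir(os.path.join('../testbed/Final', p))}
hw_students = set(person_to_notebooks.keys())
# Final-project students with no homework notebook in hw_corpus
print(len(final_students - hw_students))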


In [7]:
# For each author, keep only their largest homework notebook (by number of cells)
max_hw_notebook_objs = []
for key in person_to_notebooks.keys():
    cur_max = 0
    max_nb = None
    for nb in person_to_notebooks[key]:
        if nb.get_number_cells() > cur_max:
            cur_max = nb.get_number_cells()
            max_nb = nb
    max_hw_notebook_objs.append(max_nb)

In [8]:
from nbminer.stats.multiple_summary import MultipleSummary
hw_summary = MultipleSummary(max_hw_notebook_objs)
final_summary = MultipleSummary(notebook_objs)
print("Number of Final notebooks: ", len(final_summary.summary_vec))
print("Number of Homework notebooks: ", len(hw_summary.summary_vec))
print("Average number of cells, Final: ", final_summary.average_number_of_cells())
print("Average number of cells, Homework: ", hw_summary.average_number_of_cells())
print("Average lines of code, Final: ", final_summary.average_lines_of_code())
print("Average lines of code, Homework: ", hw_summary.average_lines_of_code())


Number of Final notebooks:  177
Number of Homework notebooks:  56
Average number of cells, Final:  68.92090395480226
Average number of cells, Homework:  79.30357142857143
Average lines of code, Final:  271.3502824858757
Average lines of code, Homework:  392.69642857142856

In [9]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.features.features import Features
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.encoders.ast_graph.ast_graph import ASTGraphReducer
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
from nbminer.results.reconstruction_error.astor_error import AstorError
from nbminer.results.similarity.jaccard_similarity import NotebookJaccardSimilarity
a = Features(max_hw_notebook_objs, 'group_1')
a.add_notebooks(notebook_objs, 'group_2')
gastf = GetASTFeatures()
rbn = ResampleByNode()
gi = GetImports()
fe = FeatureEncoding()
ke = KmeansEncoder(n_clusters = 100)
#agr = ASTGraphReducer(a, threshold=20, split_call=False)
njs = NotebookJaccardSimilarity()
pipe = Pipeline([gastf, rbn, gi, fe, ke, njs])
a = pipe.transform(a)


<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x151dc5e630>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x10733dd68>
<nbminer.preprocess.get_imports.GetImports object at 0x151c32d518>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x151c32db00>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x151c32d390>
<nbminer.results.similarity.jaccard_similarity.NotebookJaccardSimilarity object at 0x151dad7f60>
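
The printed lines above are just each pipeline stage announcing itself as it runs. Conceptually the Pipeline chains the stages' transforms in order; a rough sketch of that behavior (for illustration only, not the actual nbminer source):

In [ ]:
class SimplePipeline:
    """Sketch of a sequential transform pipeline, for illustration only."""
    def __init__(self, steps):
        self.steps = steps

    def transform(self, features):
        # Apply each stage to the output of the previous one
        for step in self.steps:
            features = step.transform(features)
        return features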

In [10]:
import numpy as np
intra, inter = njs.group_average_jaccard_similarity('group_1')
print('Mean within group: ', np.mean(np.array(intra)))
print('STD within group: ', np.std(np.array(intra)))
print('Mean outside group: ', np.mean(np.array(inter)))
print('STD outside group: ', np.std(np.array(inter)))


Mean within group:  0.263525329937
STD within group:  0.0392584417538
Mean outside group:  0.243111132076
STD outside group:  0.0359253606179
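
For reference, the Jaccard similarity between two notebooks is presumably computed here over their sets of template labels (the KMeans cluster ids assigned above): |A ∩ B| / |A ∪ B|. A minimal sketch of that pairwise score, independent of the exact NotebookJaccardSimilarity implementation:

In [ ]:
def jaccard(a, b):
    """Jaccard similarity of two collections of template ids."""
    a, b = set(a), set(b)
    if not a and not b:
        return 0.0
    return len(a & b) / len(a | b)

# e.g. jaccard(['template_0', 'template_7'], ['template_0', 'template_9']) -> 1/3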

In [11]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.features.features import Features
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.encoders.ast_graph.ast_graph import ASTGraphReducer
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
from nbminer.results.similarity.jaccard_similarity import NotebookJaccardSimilarity
from nbminer.results.prediction.corpus_identifier import CorpusIdentifier
a = Features(max_hw_notebook_objs, 'group_1')
a.add_notebooks(notebook_objs, 'group_2')
gastf = GetASTFeatures()
rbn = ResampleByNode()
gi = GetImports()
fe = FeatureEncoding()
ke = KmeansEncoder(n_clusters = 100)
#agr = ASTGraphReducer(a, threshold=20, split_call=False)
ci = CorpusIdentifier()
pipe = Pipeline([gastf, rbn, gi, fe, ke, ci])
a = pipe.transform(a)


<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x10a83e9b0>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x1a305eda58>
<nbminer.preprocess.get_imports.GetImports object at 0x1a305edcf8>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x1a30d8eb00>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x1a30d8e7f0>
<nbminer.results.prediction.corpus_identifier.CorpusIdentifier object at 0x1a30e812e8>

In [66]:
%matplotlib inline
import matplotlib.pyplot as plt
fpr, tpr, m = ci.predict()
print(m)
plt.plot(fpr, tpr)


0.321249430005
Out[66]:
[<matplotlib.lines.Line2D at 0x1a380bb198>]
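
It is not obvious from the CorpusIdentifier output alone what the printed 0.321 measures. If the area under the plotted ROC curve is wanted, it can be computed directly from the returned arrays (assuming fpr and tpr are the usual monotonically ordered ROC coordinates):

In [ ]:
from sklearn.metrics import auc
# Area under the ROC curve plotted above
print(auc(fpr, tpr))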

In [49]:
# Count template occurrences per corpus: template_counter maps group -> template id -> count
template_counter = {'group_1': {}, 'group_2': {}}
for i in range(a.get_number_notebooks()):
    group = a.get_notebook(i).get_feature('import_name')
    for seg in a.get_notebook_segments(i):
        templ = seg.get_feature('template')
        if templ is not None:
            if templ not in template_counter[group]:
                template_counter[group][templ] = 0
            template_counter[group][templ] += 1

In [58]:
# Raw counts per template: homework (group_1) vs final (group_2)
for key in template_counter['group_1'].keys():
    print(template_counter['group_1'][key], template_counter['group_2'][key])


8317 14213
109 109
926 1660
938 1587
91 170
3077 4940
159 235
1874 3200
222 353
300 487
117 205
150 223
36 67
256 420
118 208
151 265
225 394
85 93
1139 2022
106 188
58 84
239 454
221 408
233 427
164 294
200 355
655 1248
370 676
146 214
63 99
82 139
75 149
89 177
94 168
63 107
203 360
551 1044
149 278
76 147
111 175
237 438
34 60
174 302
76 136
197 340
229 405
53 67
132 209
43 77
128 198
67 117
28 45
322 631
272 483
52 94
1 1
31 59
92 136
332 638
45 81
99 184
105 186
115 200
534 1059
101 185
115 205
37 64
83 153
11 16
67 122
155 291
63 122
105 198
151 281
89 156
82 126
9 14
14 26
10 10
10 17
1 1
46 91
66 131
2 2
4 4
2 2
16 30
23 43
22 41
10 19
5 9
194 388
67 134
1 2
1 2
3 6
1 2
1 2
1 2
1 2

In [67]:
# For each template, the fraction of its occurrences that come from group_1 (homework)
percentages = []
total_sum_1 = 0
total_sum_2 = 0
for key in template_counter['group_1'].keys():
    v1 = template_counter['group_1'][key]
    v2 = template_counter['group_2'][key]
    total_sum_1 += v1
    total_sum_2 += v2
    arr = (v1/(v1+v2), key, v1+v2)
    percentages.append(arr)

In [68]:
print(total_sum_1/(total_sum_1+total_sum_2))


0.3661901685242018

In [47]:
# Smaller fraction -> the template is relatively more common in group 2 (aka, the final projects);
# the overall baseline computed above is ~0.366. Only templates with more than 20 total
# occurrences are shown.
import astor
for el in sorted(percentages):
    if el[2] > 20:
        print(el)
        print(astor.to_source(ke.templates.get_random_example(el[1])))


(0.3333333333333333, 'template_70', 582)
var().magic('matplotlib inline')

(0.3333333333333333, 'template_89', 201)
sns.barplot(x='month', y='favorite_count', data=var)

(0.33458646616541354, 'template_45', 266)
var = [var.doc2bow(var) for var in var]

(0.33482142857142855, 'template_64', 224)
var = gensim.corpora.Dictionary(var)

(0.3350253807106599, 'template_68', 197)
var = plt.subplot(224)

(0.3352165725047081, 'template_28', 1593)
var = var.groupby(['hour'])[var].sum().reset_index()

(0.3357664233576642, 'template_77', 137)
var = RandomForestRegressor()

(0.3378803777544596, 'template_22', 953)
var = pd.read_json('eth_en.json')

(0.34054054054054056, 'template_73', 185)
var = LinearRegression()

(0.34080717488789236, 'template_52', 223)
var, var, var, var = train_test_split(var, var, test_size=0.4, random_state=415
    )

(0.3422680412371134, 'template_8', 970)
sum([(1) for var, var in var.items() if not var])

(0.3441933788754598, 'template_35', 1903)
plt.plot(var, var, label='EPFL')

(0.34444444444444444, 'template_29', 90)
var = var.agg(var)

(0.3448275862068966, 'template_75', 29)
def get_rt_and_fav_sum_of_hashtags(data):
    var = var.copy()
    var['hashtags'] = var['entities'].apply(var)
    var = var[['hashtags', 'retweet_count', 'favorite_count']]
    var = pd.DataFrame(var(var), columns=['hashtag', 'rt', 'fav'])
    var = []
    for var, var in var.groupby('hashtag'):
        var.append({'hashtag': var, 'rt': var.rt.sum(), 'fav': var.fav.sum()})
    return pd.DataFrame(var)

(0.3448773448773449, 'template_9', 693)
var.describe()

(0.34545454545454546, 'template_7', 1595)
var.drop('contributors', axis=1, inplace=True)

(0.3465346534653465, 'template_26', 303)
var[1, 1].set_title('ETHZ retweets per month')

(0.3475336322869955, 'template_24', 446)
var.legend(labels=['EPFL', 'ETH'])

(0.34782608695652173, 'template_14', 46)
def get_hashtags(df):
    var = []
    for var, var in var.iterrows():
        for var in var['entities'].get('hashtags'):
            var = var.get('text')
            if var not in var:
                var.append(var)
    return var

(0.3484848484848485, 'template_94', 66)
def add_year_month_hour(df):
    var['year'] = var.apply(var, axis=1)
    var['month'] = var.apply(var, axis=1)
    var['hour'] = var.apply(var, axis=1)

(0.3486590038314176, 'template_98', 261)
var = pd.to_datetime(var.created_at)

(0.34894613583138173, 'template_15', 427)
var = var.dropna()

(0.3492063492063492, 'template_4', 63)
var['created_at'].resample('A').count().plot(marker='o', color='r')

(0.34951456310679613, 'template_34', 103)
for var in var:
    var.append(str(var).split()[0].split('-')[1])

(0.34953703703703703, 'template_61', 432)
plt.hist(var, bins=var)

(0.3498233215547703, 'template_55', 283)
var.set_ylabel('Number', fontsize=12)

(0.35, 'template_67', 40)
def trend_by_year(df):
    var['year'] = var['created_at'].map(lambda x: var.year)
    var = var.groupby('year')
    var['id'].count().plot(kind='bar')
    plt.title('Number of tweets per year')
    plt.show()
    var['retweet_count'].sum().plot(kind='bar')
    plt.title('Number of retweets per year')
    plt.show()
    var['favorite_count'].sum().plot(kind='bar')
    plt.title('Number of favorites per year')
    plt.show()

(0.3511111111111111, 'template_33', 675)
var.fit(var, var)

(0.35135135135135137, 'template_12', 629)
var = var.bar(var + 0.2, var, width=0.2, color='r', align='center')

(0.3516949152542373, 'template_37', 236)
var.set_xlabel('Year', fontsize=12)

(0.353030303030303, 'template_53', 660)
plt.title("""EPFL: Histogram of Retweet Counts 
 Average per tweet: %.4s""" %
    (var.retweet_count.sum() / len(var)))

(0.3531468531468531, 'template_79', 286)
plt.subplots(figsize=(8, 6))

(0.35372848948374763, 'template_23', 1046)
plt.show()

(0.3544973544973545, 'template_63', 189)
var = sklearn.model_selection.cross_val_score(var, var, var, cv=10, scoring
    ='neg_mean_squared_error')

(0.3561643835616438, 'template_91', 146)
var = sorted(var.items(), key=operator.itemgetter(1), reverse=True)

(0.35714285714285715, 'template_80', 126)
plt.tight_layout()

(0.35807860262008734, 'template_74', 458)
plt.xlabel('Value of retweet count')

(0.3580819798917247, 'template_11', 2586)
var = var.Date.apply(lambda s: var - var.Date.tail(1)).values / (1000 * 
    1000 * 1000 * 60 * 60)

(0.35833333333333334, 'template_65', 120)
plt.suptitle('ETH months')

(0.3584905660377358, 'template_97', 212)
plt.xticks(())

(0.35877862595419846, 'template_87', 262)
var['hashtags'] = var.text.apply(lambda x: ' '.join([var.lower() for var in
    var.split() if '#' in var]))

(0.359375, 'template_72', 320)
np.all(var.isnull(), axis=1)

(0.36026490066225164, 'template_6', 755)
var = var.groupby(var['created_at'].dt.year)

(0.3603290098070231, 'template_1', 3161)
print(var)

(0.36036036036036034, 'template_27', 555)
plt.ylabel('# of Tweets')

(0.36054421768707484, 'template_66', 294)
var = set(stopwords.words('english'))

(0.3605683836589698, 'template_76', 563)
var.Race.value_counts()

(0.36082474226804123, 'template_51', 291)
print('Mean squared error: %.2f' % np.mean((var.predict(var) - var) ** 2))

(0.361198738170347, 'template_44', 634)
var = var.copy()

(0.3617021276595745, 'template_86', 94)
var['year'] = pd.Series([0] * var.shape[0], index=var.index)

(0.3619631901840491, 'template_56', 326)
var = pd.concat([var, var], axis=1)

(0.3629807692307692, 'template_38', 416)
var['created_at'] = var['created_at'].map(lambda x: var.hour)

(0.363265306122449, 'template_60', 245)
var.origin.unique()

(0.36335403726708076, 'template_41', 322)
var['year'] = [var.year for var in var.created_at.tolist()]

(0.36348949919224555, 'template_59', 619)
for var in var:
    if var % 500 == 0:
        print(var)
    var += 1
    var.append(var(var))

(0.3641304347826087, 'template_81', 184)
var.sample(5)

(0.36507936507936506, 'template_99', 315)
var = var.groupby(by=var.columns, axis=1, level=0).agg(sum)

(0.36554621848739494, 'template_31', 476)
plt.figure()

(0.36633663366336633, 'template_62', 101)
for var, var in var.items():
    print('***')
    print(var)
    print('Number of favorites: ' + str(var['favorite_count'].sum()))
    print('Number of retweets: ' + str(var['retweet_count'].sum()))
    print('\n')

(0.36685288640595903, 'template_21', 537)
def feature_importance(data, labels):
    var = ExtraTreesRegressor(n_estimators=250, random_state=0)
    var.fit(var, var)
    var = var.feature_importances_
    var = np.std([var.feature_importances_ for var in var.estimators_], axis=0)
    var = np.argsort(var)[::-1]
    print('Feature ranking:')
    for var in range(var.shape[1]):
        print('%d. feature %d (%f)' % (var + 1, var[var], var[var[var]]))
    plt.figure()
    plt.title('Feature importances')
    plt.bar(range(var.shape[1]), var[var], color='r', yerr=var[var], align=
        'center')
    plt.xticks(range(var.shape[1]), var)
    plt.xlim([-1, var.shape[1]])
    plt.show()

(0.36915224145583664, 'template_0', 22530)
var['hour'] = var.created_at.dt.hour

(0.36933385888845094, 'template_2', 5074)
var(var)

(0.37037037037037035, 'template_54', 27)
def normalize_text(text):
    var = re.sub(
        '((www\\.[^\\s]+)|(https?://[^\\s]+)|(pic\\.twitter\\.com/[^\\s]+))',
        '', var)
    var = re.sub('@[^\\s]+', '', var)
    var = re.sub('#([^\\s]+)', '', var)
    var = re.sub('[:;>?<=*+()/,\\-#!$%\\{˜|\\}\\[^_\\@\\]1234567890’‘]',
        ' ', var)
    var = re.sub('[\\d]', '', var)
    var = var.replace('.', '')
    var = var.replace("'", ' ')
    var = var.replace('"', ' ')
    return var

(0.37058823529411766, 'template_92', 170)
plt.imshow(var)

(0.37104072398190047, 'template_69', 221)
var = WordCloud().generate(var)

(0.3714851485148515, 'template_10', 2525)
var.head()

(0.378698224852071, 'template_20', 676)
pd.DataFrame(var, columns=['Favorites per tweet', 'Retweets per tweet']).plot(
    kind='bar')

(0.3811944091486658, 'template_58', 787)
for var in var.index:
    var = var.loc[var]['entities']['hashtags']
    if len(var) == 0:
        continue
    for var in var:
        if var['text'] not in var:
            var[var['text']] = {'retweet_count': 0, 'favorite_count': 0}
        var[var['text']]['retweet_count'] += var.loc[var]['retweet_count']
        var[var['text']]['favorite_count'] += var.loc[var]['favorite_count']

(0.3835616438356164, 'template_83', 73)
var = folium.Map(location=[np.mean(var.placeLatitude), np.mean(var.
    placeLongitude)], tiles='Cartodb Positron', zoom_start=6)

(0.3838094050143445, 'template_93', 8017)
var.axis('off')

(0.38608695652173913, 'template_90', 575)
print(len(var))

(0.3870967741935484, 'template_57', 341)
var.set_index(['createdAt'], inplace=True)

(0.3881118881118881, 'template_47', 286)
var.set_xticklabels(var.index.astype(str))

(0.3888888888888889, 'template_16', 162)
for var in var.index.values:
    var = var.index.str.contains(var, case=False, na=False)
    if len(var[var].index) == 1:
        var = np.append(var, var.index.get_loc(var[var].index[0]))
        var = np.append(var, var)

(0.391304347826087, 'template_36', 23)
var = [var('bahnhofstrasse', 47.377, 8.54, 2005, -1)[0]['url_m'], var(
    'bahnhofstrasse', 47.377, 8.54, 2006, -1)[5]['url_m'], var(
    'bahnhofstrasse', 47.377, 8.54, 2007, -1)[2]['url_m'], var(
    'bahnhofstrasse', 47.377, 8.54, 2008, -1)[9]['url_m'], var(
    'bahnhofstrasse', 47.377, 8.54, 2009, -1)[4]['url_m'], var(
    'bahnhofstrasse', 47.377, 8.54, 2010, -1)[2]['url_m'], var(
    'bahnhofstrasse', 47.377, 8.54, 2011, -1)[0]['url_m'], var(
    'bahnhofstrasse', 47.377, 8.54, 2012, -1)[0]['url_m'], var(
    'bahnhofstrasse', 47.377, 8.54, 2013, -1)[1]['url_m'], var(
    'bahnhofstrasse', 47.377, 8.54, 2014, -1)[9]['url_m'], var(
    'bahnhofstrasse', 47.377, 8.54, 2015, -1)[5]['url_m'], var(
    'bahnhofstrasse', 47.377, 8.54, 2016, -1)[54]['url_m']]

(0.39263803680981596, 'template_78', 326)
var.created_at.groupby(var.created_at.dt.year).count().plot(kind='bar')

(0.3942307692307692, 'template_49', 208)
var.isnull().sum().append(var.isnull().sum())

(0.40214477211796246, 'template_25', 373)
var.reset_index(inplace=True)

(0.40350877192982454, 'template_82', 228)
var = var.sort_values(axis=0, ascending=False)[var > 15].index[1:]

(0.4035532994923858, 'template_40', 394)
var['n_at'] = [len(list(set({var.strip('@') for var in var.split() if var.
    startswith('#')}))) for var in var.text]

(0.40555555555555556, 'template_18', 360)
var = open('text_eth.csv', 'r', encoding='utf-8').readlines()

(0.4074074074074074, 'template_96', 27)
def format_individual_global_voting_profile(voting_unique):
    var = var.set_index(['ParlGroupName', 'Name'])[['Decision']]
    var = lambda x: np.sum(var == 1) / len(var)
    var = lambda x: np.sum(var == 2) / len(var)
    var = lambda x: np.sum(var == 3) / len(var)
    var = var.groupby(level=['ParlGroupName', 'Name']).agg({'Decision': {
        'Yes': var, 'No': var, 'Abstention': var}})
    var.columns = var.columns.droplevel(0)
    return var

(0.4084507042253521, 'template_85', 142)
var = list(map(lambda city: [var.Canton.values[0], str(var.Longitude.values
    [0]) + ',' + str(var.Latitude.values[0])], var))

(0.44166666666666665, 'template_30', 120)
for var, var in var.groupby('radio'):
    var = var.reset_index().drop(['date', 'tags', 'radio'], axis=1)
    var['artists'] = var['artists'].apply(lambda x: ', '.join(var))
    var = 0
    var = []
    for var, var in var.groupby('artists'):
        var = var + 1
        var.append(var['track'].unique().size)
    var = numpy.mean(var)
    print(var + ': ' + str(var) + ' artists, with ' + str(round(var, 1)) +
        ' mean unique songs per artist')

(0.47752808988764045, 'template_71', 178)
var.to_csv('emotion_season_data_LexiconBasedApproach/spring5.csv', index=None)

(0.5, 'template_48', 218)
var = pd.read_csv(var, sep='\t', encoding='utf-8', quoting=csv.QUOTE_NONE,
    header=None, escapechar='\\', na_values='N', names=var)


In [28]:
import astor
print(astor.to_source(ke.templates.get_random_example('template_89')))
print(astor.to_source(ke.templates.get_random_example('template_64')))
print(astor.to_source(ke.templates.get_random_example('template_73')))
print(astor.to_source(ke.templates.get_random_example('template_5')))


var = sns.barplot(var.index, var.favorite_count)

var = corpora.Dictionary(var)

var = LinearRegression(n_jobs=-1)

